Load required libraries.
library(arm)
library(ggplot2)
library(grid)
library(dplyr)
library(tidyr)
library(RMySQL)
library(RCurl)
source('data_loading.R')
Load in Wordbank common data.
wordbank <- src_mysql(host = "54.149.39.46", dbname="wordbank",
user = "wordbank", password = "wordbank")
common.tables <- get.common.tables(wordbank)
admins <- get.administration.data(common.tables$momed,
common.tables$child,
common.tables$instrumentsmap,
common.tables$administration) %>%
filter(form == "WS", age > 15, age < 32) %>%
mutate(age.group = cut(age, breaks = c(15, 19, 23, 27, 31)))
items <- get.item.data(common.tables$wordmapping,
common.tables$instrumentsmap,
common.tables$category)
instrument.tables <- get.instrument.tables(wordbank, common.tables$instruments)
# TEMPORARY HACK
instrument.tables$has_grammar <- c(0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0)
grammar.languages <- unique(filter(instrument.tables, has_grammar == 1)$language)
languages <- unique(instrument.tables$language)
Show number of items in each relevant section.
sections <- items %>%
filter(form == "WS") %>%
group_by(language, type) %>%
summarise(n = n()) %>%
spread(type, n) %>%
select(language, word, word_form, complexity)
sections[is.na(sections)] = 0
kable(sections)
| language | word | word_form | complexity |
|---|---|---|---|
| Croatian | 717 | 0 | 0 |
| Danish | 725 | 29 | 33 |
| English | 680 | 25 | 37 |
| German | 588 | 0 | 0 |
| Italian | 670 | 0 | 0 |
| Norwegian | 731 | 33 | 42 |
| Russian | 728 | 0 | 0 |
| Spanish | 680 | 24 | 37 |
| Swedish | 710 | 0 | 0 |
| Turkish | 711 | 0 | 0 |
Show total number of administrations in each language.
n.admin <- admins %>%
group_by(language) %>%
summarise(n = n())
kable(n.admin)
| language | n |
|---|---|
| Croatian | 377 |
| Danish | 3038 |
| English | 5595 |
| German | 1183 |
| Italian | 658 |
| Norwegian | 10095 |
| Russian | 773 |
| Spanish | 1094 |
| Swedish | 900 |
| Turkish | 1861 |
Show number of administrations in each language by age group.
n.admin.age <- admins %>%
group_by(language, age.group) %>%
summarise(n = n()) %>%
spread(age.group, n)
kable(n.admin.age)
| language | (15,19] | (19,23] | (23,27] | (27,31] |
|---|---|---|---|---|
| Croatian | 81 | 105 | 114 | 77 |
| Danish | 725 | 857 | 736 | 720 |
| English | 2020 | 947 | 1241 | 1387 |
| German | 164 | 364 | 380 | 275 |
| Italian | 70 | 227 | 194 | 167 |
| Norwegian | 1236 | 2892 | 3176 | 2791 |
| Russian | 83 | 195 | 239 | 256 |
| Spanish | 260 | 322 | 290 | 222 |
| Swedish | 311 | 307 | 143 | 139 |
| Turkish | 465 | 500 | 460 | 436 |
Some utility functions for transforming data values.
get.coded.type <- function(type, complexity_category) {
if (type == "complexity") {
return(complexity_category)
} else {
return(type)
}
}
get.value <- function(type, value) {
if (type == "word_form" | type == "word") {
return(value == "produces")
} else if (type == "complexity") {
return(value == "complex")
}
}
Get kid by item data for wordform and complexity items all languages and aggregate them.
get.grammar.data <- function(lang) {
lang.grammar.items <- items %>%
filter(language == lang, form == "WS",
type == "word_form" | type == "complexity") %>%
rename(column = item.id) %>%
mutate(item.id = as.numeric(substr(column, 6, nchar(column)))) %>%
select(column, item.id, type, item, definition, complexity_category)
lang.instrument.table <- filter(instrument.tables, language == lang,
form == "WS")$table[[1]]
lang.grammar.data <- get.instrument.data(lang.instrument.table,
lang.grammar.items$column) %>%
left_join(lang.grammar.items) %>%
group_by(data_id, type) %>%
mutate(no_section = all(is.na(value))) %>%
filter(!no_section) %>%
mutate(value = ifelse(is.na(value), "", value),
value = get.value(unique(type), value),
coded.type = get.coded.type(unique(type), complexity_category),
coded.type = factor(coded.type,
levels = c("word_form", "morphology", "syntax"),
labels = c("Word Form",
"Complexity (Morphological)",
"Complexity (Syntactic)")),
measure = factor(type, levels = c("word_form", "complexity"),
labels = c("Word Form", "Complexity"))) %>%
ungroup() %>%
select(-complexity_category, -no_section, -type, -column)
num.words <- nrow(filter(items, language == lang, form == "WS", type == "word"))
lang.admins <- admins %>%
filter(language == lang) %>%
select(data_id, age, age.group, production, language) %>%
mutate(vocab.mean = production / num.words)
lang.data <- left_join(lang.grammar.data, lang.admins) %>%
filter(age > 15 & age < 32)
return(lang.data)
}
grammar.data <- bind_rows(sapply(grammar.languages, get.grammar.data, simplify = FALSE))
Get by kid summary data for all languages.
grammar.summary <- grammar.data %>%
group_by(language, measure, data_id, age, age.group, vocab.mean) %>%
summarise(sum = sum(value),
diff = length(value) - sum,
mean = sum / length(value))
Fit grammar score models and use them to predict data.
grammar.models <- grammar.summary %>%
group_by(language, measure) %>%
do(model = glm(cbind(sum, diff) ~ vocab.mean + age.group,
data = ., family="binomial"))
get.grammar.model <- function(lang, meas) {
return(filter(grammar.models, language == lang, measure == meas)$model[[1]])
}
grammar.predicted.data <- grammar.summary %>%
group_by(language, measure) %>%
mutate(predicted = invlogit(predict.lm(get.grammar.model(unique(language),
unique(measure)),
data = ., family="binomial")))
Plot score as a function of vocabulary size for each language and measure with model prediction curves.
ggplot(grammar.predicted.data, aes(x = vocab.mean, y = mean,
colour = age.group, fill = age.group,
label = age.group)) +
geom_jitter(alpha=.3, size=.75) +
geom_line(aes(y=predicted),size=0.65) +
facet_grid(language~measure) +
scale_x_continuous(limits = c(0,1), breaks = seq(0,1,.2),
name = "\nVocabulary Size") +
scale_y_continuous(limits = c(0,1), breaks = seq(0,1,.25),
"Score (Mean Items)\n") +
theme_bw(base_size = 11) +
theme(legend.position = c(0.06,0.92),
legend.text = element_text(size=7),
legend.title = element_text(size=7),
legend.key.height = unit(0.7, "char"),
legend.key.width = unit(0.4, "cm"),
legend.key = element_blank(),
legend.background = element_rect(fill="transparent"),
text=element_text(family=font)) +
scale_color_brewer(type="div", palette=9,
name="Age Group\n (months)") +
scale_fill_brewer(palette = "Spectral",
guide=FALSE)
Model comparison: fit grammar models and get their AICs and age coefficients.
grammar.model.metrics <- grammar.summary %>%
group_by(language, measure) %>%
do(model.vocab = glm(cbind(sum,diff) ~ vocab.mean, data = .,
family="binomial"),
model.vocab.age = glm(cbind(sum,diff) ~ vocab.mean + age, data = .,
family="binomial"),
model.vocab.age = glm(cbind(sum,diff) ~ vocab.mean * age, data = .,
family="binomial")) %>%
mutate(AIC.vocab = AIC(model.vocab),
AIC.vocab.age = AIC(model.vocab.age),
deltaAIC = AIC.vocab - AIC.vocab.age,
age.coef = coef(model.vocab.age)["age"],
age.se = se.coef(model.vocab.age)["age"])
Show AICs of grammar models.
kable(select(grammar.model.metrics,
language, measure, AIC.vocab, AIC.vocab.age, deltaAIC))
| language | measure | AIC.vocab | AIC.vocab.age | deltaAIC |
|---|---|---|---|---|
| Danish | Word Form | 11529.402 | 11508.696 | 20.70574 |
| Danish | Complexity | 13774.059 | 13324.288 | 449.77111 |
| English | Word Form | 18762.575 | 18664.378 | 98.19717 |
| English | Complexity | 30320.285 | 28695.682 | 1624.60326 |
| Norwegian | Word Form | 52851.103 | 52771.409 | 79.69365 |
| Norwegian | Complexity | 77467.584 | 74612.829 | 2854.75512 |
| Spanish | Word Form | 6285.671 | 6172.703 | 112.96810 |
| Spanish | Complexity | 10596.578 | 9700.317 | 896.26133 |
Plot age effect coefficients for each language and measure.
ggplot(grammar.model.metrics,
aes(x=language, y=age.coef, fill=measure)) +
geom_bar(position="dodge", stat="identity") +
geom_linerange(aes(ymin=age.coef-1.96*age.se, ymax=age.coef+1.96*age.se),
position = position_dodge(width=.9)) +
ylab("Age effect coefficient") +
xlab("") +
theme_bw(base_size = 14) +
scale_fill_brewer(palette="Set2",
name="Measure") +
theme(legend.position = c(0.15,0.86),
legend.text = element_text(size=14),
legend.title = element_text(size=15),
legend.key.height = unit(1.5, "char"),
legend.key = element_blank(),
legend.background = element_rect(fill="transparent"),
text=element_text(family=font))
Fit models for each wordform and complexity item and get their age coefficients.
item.models <- grammar.data %>%
group_by(language, item, coded.type) %>%
do(model = glm(value ~ vocab.mean + age, data = ., family = "binomial")) %>%
mutate(coef = coef(model)["age"],
se = se.coef(model)["age"])
Function for plotting age effect coefficients by item for a language.
plot.item.coefs <- function(item.models, lang) {
lang.item.models <- filter(item.models, language == lang) %>%
arrange(coef) %>%
mutate(item = factor(item, levels=item))
item.plot <- ggplot(lang.item.models,
aes(x=item, y=coef, fill=coded.type, label=item)) +
geom_bar(stat="identity", position="identity", alpha=.5, width=0.9) +
geom_linerange(aes(ymin=coef-1.96*se, ymax=coef+1.96*se),
position = position_dodge(width=.9)) +
theme_bw(base_size = 12) +
scale_y_continuous(name="Age Effect Coefficient") +
scale_x_discrete(name="",breaks=NULL) +
annotate("text", x = length(lang.item.models$item)/2,
y = min(lang.item.models$coef-1.96*lang.item.models$se), vjust=0,
label = lang, size = 9, family=font) +
scale_fill_brewer(palette="Set2", name="Item Type", drop=FALSE) +
theme(legend.position = c(0.22,0.82),
legend.text = element_text(size=14),
legend.title = element_text(size=13),
legend.key = element_blank(),
legend.key.height = unit(1.5, "char"),
text = element_text(family=font),
axis.title.y = element_text(size=16),
axis.text.y = element_text(size=13))
return(item.plot)
}
Plot item interactions for Norwegian.
plot.item.coefs(item.models, "Norwegian")
Plot item interactions for English.
plot.item.coefs(item.models, "English")
Plot item interactions for Danish.
plot.item.coefs(item.models, "Danish")
Plot item interactions for Spanish.
plot.item.coefs(item.models, "Spanish")
Get vocabulary composition data for all languages.
get.vocab.composition <- function(lang) {
lang.vocab.items <- filter(items, language == lang, form == "WS", type == "word") %>%
rename(column = item.id) %>%
mutate(item.id = as.numeric(substr(column, 6, nchar(column))))
lang.instrument.table <- filter(instrument.tables, language == lang,
form == "WS")$table[[1]]
lang.vocab.data <- get.instrument.data(lang.instrument.table,
lang.vocab.items$column) %>%
left_join(select(lang.vocab.items, item.id, lexical_category, item, definition)) %>%
mutate(value = ifelse(is.na(value), "", value),
value = get.value("word", value))
num.words <- nrow(lang.vocab.items)
lang.admins <- admins %>%
filter(language == lang) %>%
select(data_id, age, age.group, production, language) %>%
mutate(vocab.mean = production / num.words)
lang.vocab.summary <- left_join(lang.vocab.data, lang.admins) %>%
filter(age > 15 & age < 32) %>%
group_by(data_id, lexical_category, age, age.group, vocab.mean, language) %>%
summarise(sum = sum(value),
diff = length(value) - sum,
mean = sum / length(value))
return(lang.vocab.summary)
}
vocab.composition <- bind_rows(sapply(languages, get.vocab.composition,
simplify = FALSE)) %>%
filter(lexical_category != "other", lexical_category != "unknown") %>%
mutate(lexical_category = factor(lexical_category,
levels=c("nouns", "predicates", "function_words"),
labels=c("Nouns", "Predicates", "Function Words")))
Fit vocabulary composition models and use them to predict data.
vocab.models <- vocab.composition %>%
group_by(language, lexical_category) %>%
do(model = glm(cbind(sum, diff) ~ vocab.mean,
data = ., family = "binomial"))
get.vocab.model <- function(lang, cat) {
return(filter(vocab.models, language == lang, lexical_category == cat)$model[[1]])
}
vocab.predicted.data <- vocab.composition %>%
group_by(language, lexical_category) %>%
mutate(predicted = invlogit(predict.lm(get.vocab.model(unique(language),
unique(lexical_category)),
data = ., family = "binomial")))
Plot vocabulary composition as a function of vocabulary size for each language with model prediction curves.
ggplot(vocab.predicted.data,
aes(x=vocab.mean, y=mean, colour=lexical_category, label=lexical_category)) +
geom_jitter(alpha=0.15, size=.75) +
geom_line(aes(y=predicted),size=0.65) +
facet_wrap(~ language) +
scale_y_continuous(limits = c(0, 1.05), breaks = seq(0,1,.2),
name = "Proportion of Category\n") +
scale_x_continuous(limits = c(0,1), breaks = seq(0,1,.2),
name = "\nVocabulary Size") +
theme_bw(base_size=12) +
theme(legend.position = c(0.065,0.95),
legend.text = element_text(size=9),
legend.title = element_text(size=9, lineheight=unit(0.8, "char")),
legend.key.height = unit(0.8, "char"),
legend.key.width = unit(0.3, "cm"),
legend.key = element_blank(),
legend.background = element_rect(fill="transparent"),
text = element_text(family=font)) +
scale_color_brewer(palette = "Set2", name = "Lexical Category")
Fit vocabulary composition models and get their AICs and age coefficients.
vocab.model.metrics <- vocab.composition %>%
group_by(language, lexical_category) %>%
do(model.vocab = glm(cbind(sum, diff) ~ vocab.mean, data = .,
family="binomial"),
model.vocab.age = glm(cbind(sum, diff) ~ vocab.mean + age, data = .,
family="binomial")) %>%
mutate(AIC.vocab = AIC(model.vocab),
AIC.vocab.age = AIC(model.vocab.age),
deltaAIC = AIC.vocab - AIC.vocab.age,
age.coef = coef(model.vocab.age)["age"],
age.se = se.coef(model.vocab.age)["age"])
Show AICs of vocabulary composition models.
kable(select(vocab.model.metrics,
language, lexical_category, AIC.vocab, AIC.vocab.age, deltaAIC))
| language | lexical_category | AIC.vocab | AIC.vocab.age | deltaAIC |
|---|---|---|---|---|
| Croatian | Nouns | 6408.926 | 6291.078 | 117.8476177 |
| Croatian | Predicates | 4179.132 | 4181.121 | -1.9889332 |
| Croatian | Function Words | 4341.321 | 4330.275 | 11.0456943 |
| Danish | Nouns | 52872.674 | 52609.708 | 262.9656407 |
| Danish | Predicates | 27314.629 | 27223.805 | 90.8244673 |
| Danish | Function Words | 27541.197 | 27393.853 | 147.3439923 |
| English | Nouns | 86115.764 | 86107.445 | 8.3184184 |
| English | Predicates | 51090.656 | 50878.709 | 211.9466748 |
| English | Function Words | 53702.533 | 53404.564 | 297.9689868 |
| German | Nouns | 16395.952 | 16387.158 | 8.7931039 |
| German | Predicates | 9384.208 | 9379.200 | 5.0088950 |
| German | Function Words | 11516.074 | 11466.015 | 50.0597137 |
| Italian | Nouns | 9899.290 | 9892.355 | 6.9350636 |
| Italian | Predicates | 6940.561 | 6940.308 | 0.2539017 |
| Italian | Function Words | 7052.579 | 6845.269 | 207.3103058 |
| Norwegian | Nouns | 148432.089 | 148423.959 | 8.1295136 |
| Norwegian | Predicates | 90151.776 | 89814.037 | 337.7392848 |
| Norwegian | Function Words | 104555.150 | 104021.522 | 533.6285278 |
| Russian | Nouns | 13121.501 | 13113.062 | 8.4394271 |
| Russian | Predicates | 8146.314 | 8110.742 | 35.5720150 |
| Russian | Function Words | 8644.789 | 8226.644 | 418.1454301 |
| Spanish | Nouns | 19414.464 | 19368.882 | 45.5813467 |
| Spanish | Predicates | 13596.813 | 13594.366 | 2.4472487 |
| Spanish | Function Words | 13253.564 | 13121.481 | 132.0835721 |
| Swedish | Nouns | 15355.704 | 15116.197 | 239.5076328 |
| Swedish | Predicates | 8272.710 | 8128.340 | 144.3698614 |
| Swedish | Function Words | 7046.174 | 6840.327 | 205.8470912 |
| Turkish | Nouns | 27188.128 | 27165.389 | 22.7390742 |
| Turkish | Predicates | 27484.975 | 27387.501 | 97.4744903 |
| Turkish | Function Words | 20898.234 | 20889.244 | 8.9896409 |
Plot age effect coefficients for each language and lexical category.
ggplot(vocab.model.metrics,
aes(x=language, y=age.coef, fill=lexical_category)) +
geom_bar(position="dodge", stat="identity") +
geom_linerange(aes(ymin=age.coef-1.96*age.se, ymax=age.coef+1.96*age.se),
position = position_dodge(width=.9)) +
ylab("Age effect coefficient") +
xlab("") +
theme_bw(base_size = 14) +
scale_fill_brewer(palette = "Set2",
name = "Lexical Category") +
theme(legend.position = c(0.115,0.82),
legend.text = element_text(size=13),
legend.title = element_text(size=13),
legend.key.height = unit(1.5, "char"),
legend.key = element_blank(),
legend.background = element_rect(fill="transparent"),
text = element_text(family=font))